Análisis de https://www.nature.com/articles/srep00196.pdf
Podemos usar read_lines_chunked si el archivo original es grande. En este ejemplo, filtramos las recetas Latin American:
library(tidyverse)
# Clean one chunk of raw CSV lines: split each line into fields, keep only
# the Latin American recipes, and return their ingredient lists.
#
# @param lineas Character vector of raw CSV lines (one recipe per line).
# @param ...    Extra arguments passed by the chunked-read callback (ignored).
# @return A list of character vectors, one vector of ingredients per recipe.
limpiar <- function(lineas, ...) {
  campos <- str_split(lineas, ',')
  campos |>
    keep(\(receta) receta[1] == 'LatinAmerican') |>
    map(\(receta) receta[-1]) |>               # drop the cuisine label
    map(\(receta) receta[nchar(receta) > 0])   # drop empty fields
}
# Read the recipe file in chunks of 1000 lines, cleaning each chunk with
# `limpiar` as it is read; each chunk yields one list of recipes.
callback_limpiar <- ListCallback$new(limpiar)
filtrado <- read_lines_chunked('../datos/recetas/srep00196-s3.csv',
                               skip = 1, callback = callback_limpiar,
                               chunk_size = 1000)
# Collapse the per-chunk lists into a single list of recipes.
# purrr::flatten() was deprecated in purrr 1.0.0; list_flatten() is the
# one-level replacement.
recetas <- filtrado |> list_flatten()
recetas[1:10]
## [[1]]
## [1] "tomato" "cilantro" "lemon_juice"
## [4] "onion" "green_bell_pepper" "cayenne"
##
## [[2]]
## [1] "olive_oil" "pepper" "lemon_juice" "wheat"
## [5] "onion" "vinegar" "asparagus" "parsley"
## [9] "white_wine" "garlic" "bell_pepper" "oregano"
## [13] "basil" "vegetable_oil" "chicken"
##
## [[3]]
## [1] "tomato" "pepper" "potato" "lime_juice" "cayenne"
## [6] "cumin" "scallion" "cilantro" "cream"
##
## [[4]]
## [1] "tomato" "beef" "onion" "cayenne"
## [5] "corn" "black_pepper" "cumin" "buttermilk"
## [9] "bean" "garlic" "bell_pepper" "oregano"
## [13] "vegetable_oil" "egg" "cream"
##
## [[5]]
## [1] "tomato" "garlic" "onion" "beef" "cayenne"
## [6] "cumin" "bell_pepper"
##
## [[6]]
## [1] "olive_oil" "cilantro" "wheat"
## [4] "onion" "cayenne" "cumin"
## [7] "lettuce" "garlic" "bell_pepper"
## [10] "soybean" "mozzarella_cheese" "lime"
## [13] "turmeric"
##
## [[7]]
## [1] "butter" "cheese" "ham" "onion" "potato" "cayenne" "garlic"
## [8] "tomato"
##
## [[8]]
## [1] "cane_molasses" "olive_oil" "pepper" "red_wine"
## [5] "tabasco_pepper" "seed" "cucumber" "oregano"
## [9] "olive" "tamarind" "bread" "tomato"
## [13] "vinegar" "lemon" "onion" "parsley"
## [17] "tomato_juice" "garlic" "beef_broth" "egg"
##
## [[9]]
## [1] "butter" "tomato" "garlic" "onion" "chicken" "rice" "cayenne"
##
## [[10]]
## [1] "cane_molasses" "butter" "lemon_juice" "wheat"
## [5] "apple" "plum" "milk" "pecan"
## [9] "cinnamon" "orange" "tequila" "cream"
# Number of Latin American recipes (transactions) in the data.
length(recetas)
## [1] 2917
library(arules)
length(recetas)
## [1] 2917
## Do not make the support much smaller than this, since we have relatively
## few transactions:
pars <- list(support = 0.05, target = 'frequent itemsets',
ext = TRUE)
# Mine frequent itemsets (not rules) with the apriori algorithm;
# ext = TRUE adds extended quality information (e.g. counts).
ap_recetas <- apriori(recetas, parameter = pars)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## NA 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 frequent itemsets TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 145
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[260 item(s), 2917 transaction(s)] done [0.00s].
## sorting and recoding items ... [37 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
## sorting transactions ... done [0.00s].
## writing ... [759 set(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Total number of frequent itemsets found at this support level.
length(ap_recetas)
## [1] 759
Vemos los ítems frecuentes
# Single-ingredient frequent itemsets, sorted by support, shown as a table.
ap_1 <- subset(ap_recetas, size(ap_recetas) == 1)
frecs <- ap_1 |> sort(by = 'support') |> DATAFRAME()
# mutate_if() is superseded in dplyr; across(where(...)) is the current idiom.
DT::datatable(frecs |> mutate(across(where(is.numeric), \(x) round(x, 3))))
Y ahora examinamos combinaciones frecuentes de distintos tamaños
# Frequent ingredient pairs with support above 0.20, from most to least common.
ap_2 <- subset(ap_recetas, size(ap_recetas) == 2)
inspect(sort(subset(ap_2, support > 0.20), by = 'support'))
## items support count
## [1] {cayenne, onion} 0.5128557 1496
## [2] {garlic, onion} 0.4991430 1456
## [3] {cayenne, garlic} 0.4844018 1413
## [4] {onion, tomato} 0.4586904 1338
## [5] {cayenne, tomato} 0.4381214 1278
## [6] {garlic, tomato} 0.4220089 1231
## [7] {cumin, garlic} 0.2451148 715
## [8] {cayenne, cumin} 0.2430579 709
## [9] {cumin, onion} 0.2358588 688
## [10] {cayenne, corn} 0.2324306 678
## [11] {corn, onion} 0.2159753 630
## [12] {corn, tomato} 0.2015770 588
Incluso hay algunas combinaciones de 4 ingredientes que ocurren con frecuencia alta: estos ingredientes son bases de salsas, combinaciones de condimentos:
# Frequent 4-ingredient combinations with support above 0.10.
ap_4 <- subset(ap_recetas, size(ap_recetas) == 4)
inspect(sort(subset(ap_4, support > 0.10), by = 'support'))
## items support count
## [1] {cayenne, garlic, onion, tomato} 0.3345903 976
## [2] {cayenne, cumin, garlic, onion} 0.1960919 572
## [3] {cumin, garlic, onion, tomato} 0.1642098 479
## [4] {cayenne, cumin, onion, tomato} 0.1631814 476
## [5] {cayenne, cumin, garlic, tomato} 0.1624957 474
## [6] {cayenne, corn, garlic, onion} 0.1580391 461
## [7] {corn, garlic, onion, tomato} 0.1467261 428
## [8] {cayenne, corn, onion, tomato} 0.1463833 427
## [9] {cayenne, corn, garlic, tomato} 0.1443264 421
## [10] {cayenne, garlic, onion, oregano} 0.1343846 392
## [11] {black_pepper, cayenne, garlic, onion} 0.1278711 373
## [12] {cayenne, garlic, onion, vegetable_oil} 0.1227288 358
## [13] {cayenne, cilantro, garlic, onion} 0.1182722 345
## [14] {bell_pepper, cayenne, garlic, onion} 0.1162153 339
## [15] {garlic, onion, tomato, vegetable_oil} 0.1145012 334
## [16] {cayenne, onion, tomato, vegetable_oil} 0.1127871 329
## [17] {cayenne, cumin, garlic, oregano} 0.1114158 325
## [18] {cumin, garlic, onion, oregano} 0.1107302 323
## [19] {cayenne, cilantro, onion, tomato} 0.1100446 321
## [20] {cayenne, cumin, onion, oregano} 0.1083305 316
## [21] {cayenne, garlic, tomato, vegetable_oil} 0.1062736 310
## [22] {bell_pepper, garlic, onion, tomato} 0.1045595 305
## [23] {cayenne, cilantro, garlic, tomato} 0.1042167 304
## [24] {cayenne, cheddar_cheese, garlic, onion} 0.1038738 303
## [25] {garlic, onion, oregano, tomato} 0.1038738 303
## [26] {cayenne, cheese, garlic, onion} 0.1035310 302
## [27] {black_pepper, garlic, onion, tomato} 0.1031882 301
## [28] {bell_pepper, cayenne, onion, tomato} 0.1025026 299
## [29] {cilantro, garlic, onion, tomato} 0.1021598 298
## [30] {cayenne, cheddar_cheese, onion, tomato} 0.1018169 297
## [31] {black_pepper, cumin, garlic, onion} 0.1018169 297
## [32] {black_pepper, cayenne, onion, tomato} 0.1011313 295
## [33] {cayenne, garlic, oregano, tomato} 0.1007885 294
## [34] {cayenne, onion, oregano, tomato} 0.1004457 293
# Now mine association rules (target = 'rules') rather than bare itemsets,
# with a much lower support and a permissive confidence threshold;
# ext = TRUE adds extended quality measures to each rule.
pars <- list(support = 0.01, confidence = 0.10,
target = 'rules',
ext = TRUE)
reglas_recetas <- apriori(recetas, parameter = pars)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.1 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 29
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[260 item(s), 2917 transaction(s)] done [0.00s].
## sorting and recoding items ... [101 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 8 9 10
## Warning in apriori(recetas, parameter = pars): Mining stopped (maxlen reached).
## Only patterns up to a length of 10 returned!
## done [0.02s].
## writing ... [58318 rule(s)] done [0.01s].
## creating S4 object ... done [0.01s].
# Append a `hyper_lift` column to the quality measures of a rule set.
#
# @param reglas An arules rules object.
# @param trans  The transactions the rules were mined from.
# @return The same rules object with the extra quality column attached.
agregar_hyperlift <- function(reglas, trans) {
  hl <- interestMeasure(reglas, measure = "hyperLift", transactions = trans)
  quality(reglas) <- cbind(quality(reglas), hyper_lift = hl)
  reglas
}
# Attach hyper-lift to every mined rule.
reglas_recetas <- agregar_hyperlift(reglas_recetas, recetas)
library(arulesViz)
# Keep rules that are both interesting (hyper-lift > 1.1) and well supported.
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.1 & support > 0.1 & confidence > 0.40)
length(reglas_1)
## [1] 341
reglas_tam_2 <- subset(reglas_1, size(reglas_1)==2)
#inspect(reglas_tam_2 |> sort(by = 'hyper_lift'))
# Interactive scatter plot of the strongest rules (support > 0.2).
plot(reglas_1 |> subset(support > 0.2), engine = "plotly")
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
##
## filter
library(ggraph)
# Edge list for a rule graph: each size-2 rule becomes an edge LHS -> RHS,
# weighted by log(lift).
df_reglas <- reglas_tam_2 |> DATAFRAME() |> rename(from=LHS, to=RHS) |> data.frame()
df_reglas$weight <- log(df_reglas$lift)
# Degree centrality sizes and colours the vertices in the plot.
graph_1 <- as_tbl_graph(df_reglas) |>
mutate(centrality = centrality_degree(mode = "all"))
# Fix the seed so the force-directed ('fr') layout is reproducible.
set.seed(881)
ggraph(graph_1, layout = 'fr') +
geom_edge_link(aes(alpha=lift),
colour = 'red',
arrow = arrow(length = unit(4, 'mm'))) +
geom_node_point(aes(size = centrality, colour = centrality)) +
geom_node_text(aes(label = name), size=4,
colour = 'gray20', repel=TRUE) +
theme_graph(base_family = "sans")
# A looser filter: high hyper-lift with only a minimal confidence bound,
# which keeps many more rules.
reglas_1 <- subset(reglas_recetas, hyper_lift > 1.5 & confidence > 0.1)
length(reglas_1)
## [1] 16244
# Restrict to pairwise rules for the graph visualization.
reglas_tam_2 <- subset(reglas_1, size(reglas_1) == 2)
length(reglas_tam_2)
## [1] 135
library(tidygraph)
library(ggraph)
# Edge list from the pairwise rules; weight the edges by log hyper-lift.
# tibble::as_data_frame() was deprecated in tibble 2.0.0 (the previous run
# emitted a deprecation warning here); as_tibble() is the replacement.
df_reglas <- reglas_tam_2 |> DATAFRAME() |>
  rename(from = LHS, to = RHS) |> as_tibble()
df_reglas$weight <- log(df_reglas$hyper_lift)
# Degree centrality sizes and colours the vertices in the plot.
graph_1 <- as_tbl_graph(df_reglas) |>
  mutate(centrality = centrality_degree(mode = "all"))
# start.temp spreads out the initial Fruchterman-Reingold layout.
ggraph(graph_1, layout = 'fr', start.temp = 100) +
  geom_edge_link(aes(alpha = lift),
                 colour = 'red',
                 arrow = arrow(length = unit(4, 'mm'))) +
  geom_node_point(aes(size = centrality, colour = centrality)) +
  geom_node_text(aes(label = name), size = 4,
                 colour = 'gray20', repel = TRUE) +
  theme_graph(base_family = "sans")
## Warning: ggrepel: 2 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Exportamos para examinar en Gephi:
#write_csv(df_reglas |> rename(source=from, target=to) |>
# select(-count), 'reglas.csv')
La combinación corn y starch puede deberse en parte a una separación incorrecta en el procesamiento de los datos (corn starch o maicena convertido en dos ingredientes, corn y starch):
#df_reglas |> filter(from == "{corn}", to == "{starch}")
La confianza es considerablemente alta, aunque tenemos pocos datos de esta combinación. Podemos examinar algunos ejemplos:
#recetas |> keep(~ "starch" %in% .x & "corn" %in% .x) |> head(10)